################################################################################
###
###   Replication code for Tables 1, 2, and 3 of Clarke, Kenkel, & Rueda 2016.
###   
###   NOTE: User must set working directory appropriately as described in 
###         attached README file. 
###
###   Created: 7/19/2016
###   Last modified: 7/19/2016
###
################################################################################

rm(list = ls())


setwd("~/Desktop/Replication")


library(Matching)
library(Hmisc)
library(boot)

#Functions to be used in estimation
nearestATT <- function(ps, tr, y)
{
  ans <- Match(Y = y, Tr = tr, X = ps, estimand = "ATT", BiasAdjust = FALSE,
               version = "fast", replace = TRUE, M = 1, distance.tolerance =
                 1e-8)
  return(ans$est)
}

blockATT <- function(ps, tr, y, strata = 10)
{
  ps <- cut2(ps, g = strata)
  ans <- tapply(y, list(ps, tr), mean)
  ans <- apply(ans, 1, diff)
  ans <- weighted.mean(ans, table(ps[tr == 1]), na.rm = TRUE)
  return(ans)
}

weightATT <- function(ps, tr, y)
{
  ps <- invlogit(ps)
  ans1<-mean(y[tr==1]) 
  ans2<-( sum( y[tr==0]*ps[tr==0]/(1-ps[tr==0]) )/ sum( ps[tr==0]/(1-ps[tr==0])) )
  return(ans1-ans2)
}

regressATE <- function(ps, tr, y)
{
  ps <- invlogit(ps)
  ans <- lm(y ~ tr + ps + I(tr * (ps - mean(ps))))
  return(coef(ans)[2])
}

caliperATT <- function(ps, tr, y, X, caliper = .25)
{
  caliper <- c(caliper, rep(100, ncol(X)))
  X <- cbind(ps, X)
  ans <- Match(Y = y, Tr = tr, X = X, estimand = "ATT", BiasAdjust = FALSE,
               caliper = caliper, Weight = 2, version = "fast", replace =
                 TRUE, M = 1, distance.tolerance = 1e-8)
  return(ans$est)
}


#Loading Lalonde's data
load("DWvsCPS.rda")

DWvsCPS <- transform(DWvsCPS,
                     age = age / 100,
                     education = education / 10,
                     u74 = as.numeric(re74 == 0),
                     u75 = as.numeric(re75 == 0))


attach(DWvsCPS)

#Setting data matrixes


X<-cbind(age,I(age^2),education,I(education^2),black,hispanic,married,nodegree,re74,re75,u74,u75,I(education*re74),I(age^3))
tr<-treated
y<-re78



#The function 'patt' generates a matrix that has: The coefficients of true outcome equations and true propensity score equations, correlation of omitted
#variables and the calculated ATT bias for specifications that have a given (Z,W) combination. 
#It takes as arguments:
#Z: Column number of the always omitted variable
#W: Column number of the ommitted variable
#X: Matrix with all control variables that have the 'true' specification see footnote page 1059 Dehejia and Wahba
#tr: Vector with treatment variable
#y: Outcome variable 
#cal: 1 if only checking signs for caliper matching, 0 all estimation methods

patt<-function(Z,W,X,tr,y,cal){
  
  if(Z==1|Z==2|Z==14){					#taking out age variables
    X2<-X[,-c(1,2,14)]
    X1<-X[,-c(1,2,14,W)]
  }
  else if(Z==3|Z==4|Z==13){				#taking out education variables
    X2<-X[,-c(3,4,13)]
    X1<-X[,-c(3,4,13,W)]
  }
  else if(Z==9|Z==11|Z==13){				#taking out income in 74 variables
    X2<-X[,-c(9,11,13)]
    X1<-X[,-c(9,11,13,W)]
  }
  else if(Z==10|Z==12){ 					#taking out income in 75 variables
    X2<-X[,-c(10,12)]
    X1<-X[,-c(10,12,W)]
  }
  else {							#taking out other variables
    X2<-X[,-Z]
    X1<-X[,-c(Z,W)]
  }
  
  pst<-glm(tr~X,family = binomial(link = "logit"))
  ps1<-glm(tr~X1,family = binomial(link = "logit"))
  ps2<-glm(tr~X2,family = binomial(link = "logit"))
  
  
  ps1f <- ps1$fitted.values	
  S1 <- ((ps1f >= min(ps1f[tr == 1]))&(ps1f <= max( ps1f[tr == 1])))	#Only taking observation with propensity score that are greater
  #than the minimum propensity score of the treated units but less than
  #the maximum ps of the treated
  
  ps2f <- ps2$fitted.values	
  S2 <- ((ps2f >= min(ps2f[tr == 1]))&(ps2f <= max( ps2f[tr == 1])))
  
  rho<-corr(cbind(X[,Z],X[,W]))
  
  out<-lm(y ~ X)										#'true' outcome equation 
  att <- 1794											#experimental benchmark for the ATT 
  
  ps1<-ps1$linear.pred
  ps2<-ps2$linear.pred	
  
  if(cal!=1){
    nea1 <- nearestATT(ps1[S1], tr[S1], y[S1])
    nea2 <- nearestATT(ps2[S2], tr[S2], y[S2])
    blo1 <- blockATT(ps1[S1], tr[S1], y[S1], strata = 10)
    blo2 <- blockATT(ps2[S2], tr[S2], y[S2], strata = 10)
    wei1 <- weightATT(ps1[S1], tr[S1], y[S1])
    wei2 <- weightATT(ps2[S2], tr[S2], y[S2])
    cal1 <- caliperATT(ps1[S1], tr[S1], y[S1], X = X1[S1,], caliper = 0.25)
    cal2 <- caliperATT(ps2[S2], tr[S2], y[S2], X = X2[S2,], caliper = 0.25)
    
    return(c(Z,W,pst$coeff[Z+1],out$coeff[Z+1],pst$coeff[W+1],out$coeff[W+1],rho,
             (abs(nea2-att)-abs(nea1-att)),(abs(blo2-att)-abs(blo1-att)),
             (abs(wei2-att)-abs(wei1-att)),0,(abs(cal2-att)-abs(cal1-att)),0))
    
  }	
  else{
    
    cal1 <- caliperATT(ps1[S1], tr[S1], y[S1], X = X1[S1,], caliper = 0.25)
    cal2 <- caliperATT(ps2[S2], tr[S2], y[S2], X = X2[S2,], caliper = 0.25)	
    return(c(Z,W,pst$coeff[Z+1],out$coeff[Z+1],pst$coeff[W+1],out$coeff[W+1],rho,
             (abs(cal2-att)-abs(cal1-att)),0))
  }
}




#The resBb matrix contains the bias of all possible (Z,W) combinations
a=1
resBb<-matrix(rep(0,9*(dim(X)[2])^2),9,(dim(X)[2])^2)
pb <- txtProgressBar(min = 2, max = dim(X)[2]^2 + 1)
for (i in 1:(dim(X)[2])){
  for (j in 1:(dim(X)[2])){
    pa<-patt(i,j,X,tr,y,1)	
    resBb[,a]<-matrix(pa,length(pa),1)
    a=a+1
    setTxtProgressBar(pb, a)
  }
}

#the signs function gives the signs of rho, gammaZ*betaZ, gammaW*betaW, increase in bias after including W
#for a vector that includes true coefficients and ATT as given by function 'patt'

signs<-function(res,cal){
  aux<-sign(c(res[3]*res[4],res[5]*res[6]))
  
  if (cal==1){
    r<-c(sign(res[7]),aux,sign(res[8]))
  }
  else{
    r<-c(sign(res[7]),aux,sign(res[8:12]))
  }
  return(r)
}

#The search_patt function returns the bias and true coefficients of (Z,W) combinations if 
#their rho, gammaZ*betaZ, gammaW*betaW and increase in bias follow a given sign pattern

search_patt<-function(pattern,res,cal){
  sig<-apply(res,2,signs,cal)
  
  for (i in 1:(dim(res)[2])){
    if (sum(abs(pattern-sig[,i]))==0){
      if (cal==1){res[9,i]<-1}
      else {res[13,i]<-1}
    }		
  }
  
  if (cal==1){return(res[,res[9,]==1])}
  else{return(res[,res[13,]==1])}
}



#The balance function checks balance after matching for specifications with and without W using caliper. 
#it returns the number of contorl variables whose pvalue in t test has increased, indicating and improvement in balance

Balance2<-function(Z,W,X,tr,y,ATE){
  
  #Remember: X1 is with W omitted and X2 is with W included.
  #Codes of variables (age 1,I(age^2) 2,education 3,education^2 4
  #,black 5,hispanic 6,married 7,nodegree 8, re74 9,re75 10,u74 11,u75 12,I(education*re74) 13, age^3 14)
  
  if(Z==1|Z==2|Z==14){
    X2<-X[,-c(1,2,14)]
    X1<-X[,-c(1,2,14,W)]
  }
  else if(Z==3|Z==4|Z==13){
    X2<-X[,-c(3,4,13)]
    X1<-X[,-c(3,4,13,W)]
  }
  else if(Z==9|Z==11|Z==13){
    X2<-X[,-c(9,11,13)]
    X1<-X[,-c(9,11,13,W)]
  }
  else if(Z==10|Z==12){
    X2<-X[,-c(10,12)]
    X1<-X[,-c(10,12,W)]
  }
  else {
    X2<-X[,-Z]
    X1<-X[,-c(Z,W)]
  }
  
  X1<-data.matrix(X1)
  X2<-data.matrix(X2)
  
  ps1<-glm(tr~X1,family = binomial(link = "logit"))
  ps2<-glm(tr~X2,family = binomial(link = "logit"))
  
  
  ps1f <- ps1$fitted.values	
  S1 <- ((ps1f >= min(ps1f[tr == 1]))&(ps1f <= max( ps1f[tr == 1])))
  
  ps2f <- ps2$fitted.values	
  S2 <- ((ps2f >= min(ps2f[tr == 1]))&(ps2f <= max( ps2f[tr == 1])))
  
  
  att <- 1794
  
  ps1<-ps1$linear.pred
  ps2<-ps2$linear.pred	
  
  if (ATE==1){
    EF<-"ATE"
  }
  else{
    EF<-"ATT"
  }
  caliper <- c(0.25, rep(100, ncol(X1[S1,])))
  X <- cbind(ps1[S1], X1[S1,])
  ans_1 <- Match(Y = y[S1], Tr = tr[S1], X = X, estimand = EF, BiasAdjust = FALSE,
                 caliper = caliper, Weight = 2, version = "fast", replace =
                   TRUE, M = 1, distance.tolerance = 1e-8)
  cal1<-ans_1$est
  
  bal1<-MatchBalance(tr[S1]~X1[S1,],data=X, match.out=ans_1, nboots=500,print.level=1, ks = FALSE)	
  
  p1<-matrix(rep(0,ncol(X1)),ncol(X1),1)
  for(i in 1:ncol(X1[S1,])){
    p1[i,1]<-bal1$AfterMatching[[i]]$p.value
  }
  p1<-data.frame(cbind(t(t(colnames(X1))),p1))
  names(p1)<-c("id","p1") 
  
  caliper <- c(0.25, rep(100, ncol(X2[S2,])))
  X <- cbind(ps2[S2], X2[S2,])
  ans_2 <- Match(Y = y[S2], Tr = tr[S2], X = X, estimand = EF, BiasAdjust = FALSE,
                 caliper = caliper, Weight = 2, version = "fast", replace =
                   TRUE, M = 1, distance.tolerance = 1e-8)      
  
  cal2<-ans_2$est	
  
  bal2<-MatchBalance(tr[S2]~X2[S2,],data=X2, match.out=ans_2, nboots=500,print.level=1, ks = FALSE)	
  
  p2<-matrix(rep(0,ncol(X2)),ncol(X2),1)
  for(i in 1:ncol(X2[S2,])){
    p2[i,1]<-bal2$AfterMatching[[i]]$p.value
  }
  p2<-data.frame(cbind(t(t(colnames(X2))),p2))
  names(p2)<-c("id","p2")      
  
  total<- merge(p1,p2,by="id")
  
  total$p1<-as.numeric(as.character(total$p1)) 
  total$p2<-as.numeric(as.character(total$p2)) 
  
  r1<-sum(1*(total$p1<=total$p2))
  
  result<-c(Z,W,(abs(cal2-att)-abs(cal1-att)),r1,ncol(X1),bal1$AMsmallest.p.value,bal2$AMsmallest.p.value)	
  return(result)
}

#Creating identified pattern matrices

patternA<-c(1,1,-1,1)
patternB<-c(1,-1,1,1)
patternC<-c(-1,1,-1,1)
patternD<-c(-1,-1,1,1)

A_ATT<-search_patt(patternA,resBb,1)
B_ATT<-search_patt(patternB,resBb,1)
C_ATT<-search_patt(patternC,resBb,1)
D_ATT<-search_patt(patternD,resBb,1)

#Balancing function
load("DWvsCPS.rda")

DWvsCPS <- transform(DWvsCPS,
                     age = age / 100,
                     education = education / 10,
                     u74 = as.numeric(re74 == 0),
                     u75 = as.numeric(re75 == 0))

X<-data.frame(cbind(DWvsCPS$age,I(DWvsCPS$age^2),DWvsCPS$education,I(DWvsCPS$education^2),DWvsCPS$black,DWvsCPS$hispanic,DWvsCPS$married,DWvsCPS$nodegree,DWvsCPS$re74,DWvsCPS$re75,DWvsCPS$u74,DWvsCPS$u75,I(DWvsCPS$education*DWvsCPS$re74),I(DWvsCPS$age^3)))
names(X)<-c(1,2,3,4,5,6,7,8,9,10,11,12,13,14)

tr<-DWvsCPS$treated
y<-DWvsCPS$re78

#For Lalonde A_ATT2 is one dimensional change if necessary for other datasets
A_ATT2<-matrix(rep(0,7*1),7,1)
for (i in 1:1){
  A_ATT2[,i]<-Balance2(Z=A_ATT[1],W=A_ATT[2],X,tr,y,ATE=0)
}

B_ATT2<-matrix(rep(0,7*dim(B_ATT)[2]),7,dim(B_ATT)[2])
for (i in 1:dim(B_ATT)[2]){
  B_ATT2[,i]<-Balance2(B_ATT[1,i],B_ATT[2,i],X,tr,y,ATE=0)
}

C_ATT2<-matrix(rep(0,7*dim(C_ATT)[2]),7,dim(C_ATT)[2])
for (i in 1:dim(C_ATT)[2] ){
  C_ATT2[,i]<-Balance2(C_ATT[1,i],C_ATT[2,i],X,tr,y,ATE=0)
}

D_ATT2<-matrix(rep(0,7*dim(D_ATT)[2]),7,dim(D_ATT)[2])
for (i in 1:dim(D_ATT)[2]){
  D_ATT2[,i]<-Balance2(D_ATT[1,i],D_ATT[2,i],X,tr,y,ATE=0)
}


#Here are the results for the table


#Organizing results without duplicates
elim_duplicates<-function(A,c){
  A_aux<-A[order(A[,c]),]
  A_aux<-A_aux[duplicated(A_aux[,c])==FALSE,]
  return(A_aux)
}

resBb_nd<-elim_duplicates(t(resBb),8)
A_o<-t(A_ATT2)
B_o<-elim_duplicates(t(B_ATT2),3)
C_o<-elim_duplicates(t(C_ATT2),3)
D_o<-elim_duplicates(t(D_ATT2),3)

AC<-rbind(A_o,C_o)
BD<-rbind(B_o,D_o)

AC[,3]<-AC[,3]/1794
BD[,3]<-BD[,3]/1794

#number of cases where bias is increased and there are countervailing effects
num_pos_neg<-dim(AC)[1]
num_neg_pos<-dim(BD)[1]

AC<-cbind(t(t(rep("+_-",num_pos_neg))),AC)
BD<-cbind(t(t(rep("-_+",num_neg_pos))),BD)

table1<-rbind(AC,BD)
table1<-table1[,1:6]

colnames(table1)<-c("sgn_cu_cx","u","x","change_bias_att","#var_imp_bal","#vars")


#Adding names of variables
colnames(X)<-c("age","age2","education","education2","black","hispanic","married","nodegree","re74","re75","u74","u75","educationre74","age3")
table1 <- as.data.frame(as.matrix(table1)) 
varindices <- names(as.data.frame(X))
varindices <- as.data.frame(cbind(varindices, c(1:length(varindices))))

names(varindices) <- c("var_x", "x")
table1 <- merge(varindices,table1,by="x")
names(varindices) <- c("var_u", "u")
table1 <- merge(varindices,table1,by="u")

table1$x<-NULL
table1$u<-NULL

table1<-table1[order(table1$change_bias_att,decreasing= TRUE),]

#Reported table
table1[,c(3,1,2,4,5,6)]
sink()


################################################################################
###
###  Begin Replication of Table 2
###
################################################################################

require(foreign)

# Load in the data
mckenzie <- read.dta("jeeafile1.dta")

# Cleaning the data to match observations from original study
# this subsetting is based on the subset performed in McKenzie .do file, beginning
# on line 47. 
indices <- intersect(which(mckenzie$NZdata==1), which(mckenzie$princapp==1))
indices <- c(indices, which(mckenzie$group3==1))
indices <- sort(unique(indices))
mdata <- mckenzie[indices,]
mdata <- mdata[,c("workincome",
                  "migrate",
                  "male",
                  "married",
                  "age",
                  "age2",
                  "age3",
                  "age4",
                  "years",
                  "years2",
                  "years3",
                  "years4",
                  "bornTongatapu",
                  "height",
                  "pastincome",
                  "maleage",
                  "maleyears",
                  "malemarried",
                  "maleTT",
                  "malepast",
                  "maleheight",
                  "ageyears")]
mdata <- mdata[complete.cases(mdata),]
rm(list = c("mckenzie", "indices"))
attach(mdata)

# Setting data matrices.
# Note, this uses the propensity score specification from model "C" which appears
# in table 5 of McKenzie et al paper. 

X <- cbind(male,
           married,
           age,age2,age3,age4,
           years, years2,years3,years4,
           bornTongatapu,
           height,
           pastincome,
           maleage,
           maleyears,
           malemarried,
           maleTT,
           malepast,
           maleheight,
           ageyears)
tr <- migrate
y <- workincome


# Before defining function "patt," we'll create objects with indices of interacted
# and higher-order variables to exclude by group to keep the function relatively clean
imale <- c(1,14, 15, 16, 17, 18, 19)
iage <- c(3, 4, 5, 6, 14, 20)
iyears <- c(7, 8, 9, 10, 15, 20)
imarried <- c(2, 16)
iTT <- c(11, 17)
iheight <- c(12, 19)
ipastincome <- c(13, 18)


#The function 'patt' generates a matrix that has: The coefficients of true outcome 
#equations and true propensity score equations, correlation of omitted
#variables and the calculated ATT bias for specifications that have a given (Z,W) combination. 
#It takes as arguments:
#Z: Column number of the always omitted variable
#W: Column number of the ommitted variable
#X: Matrix with all control variables that have the 'true' specification see footnote page 1059 Dehejia and Wahba
#tr: Vector with treatment variable
#y: Outcome variable 
#cal: 1 if only checking signs for caliper matching, 0 all estimation methods

patt<-function(Z,W,X,tr,y,cal){
  
  # Each if statement removes the corresponding variable and all interacted 
  # variables, building W and Z appropriately
  
  if(Z %in% imale){
    X2 <- X[,-imale]
    X1 <- X[,-c(imale, W)]
  }
  else if(Z %in% iage){
    X2 <- X[, -iage]
    X1 <- X[, -c(iage,W)]
  }
  else if(Z %in% iyears){
    X2 <- X[, -iyears]
    X1 <- X[, -c(iyears, W)]
  }
  else if(Z %in% imarried){
    X2 <- X[, -imarried]
    X1 <- X[, -c(imarried, W)]
  }
  else if(Z %in% iTT){
    X2 <- X[, -iTT]
    X1 <- X[, -c(iTT, W)]
  }
  else if(Z %in% iheight){
    X2 <- X[, -iheight]
    X1 <- X[, -c(iheight, W)]
  }
  else if(Z %in% ipastincome){
    X2 <- X[, -ipastincome]
    X1 <- X[, -c(ipastincome, W)]
  }
  else {
    X2 <- X[,-Z]
    X1 <- X[, -c(Z,W)]
  }
  
  pst<-glm(tr~X,family = binomial(link = "logit"))
  ps1<-glm(tr~X1,family = binomial(link = "logit"))
  ps2<-glm(tr~X2,family = binomial(link = "logit"))
  
  
  ps1f <- ps1$fitted.values	
  S1 <- ((ps1f >= min(ps1f[tr == 1]))&(ps1f <= max( ps1f[tr == 1])))	#Only taking observation with propensity score that are greater
  #than the minimum propensity score of the treated units but less than
  #the maximum ps of the treated
  
  ps2f <- ps2$fitted.values	
  S2 <- ((ps2f >= min(ps2f[tr == 1]))&(ps2f <= max( ps2f[tr == 1])))
  
  rho<-corr(cbind(X[,Z],X[,W]))
  
  out<-lm(y ~ X)										#'true' outcome equation 
  att <- 274											#experimental benchmark for the ATT 
  
  ps1<-ps1$linear.pred
  ps2<-ps2$linear.pred	
  
  if(cal!=1){
    nea1 <- nearestATT(ps1[S1], tr[S1], y[S1])
    nea2 <- nearestATT(ps2[S2], tr[S2], y[S2])
    blo1 <- blockATT(ps1[S1], tr[S1], y[S1], strata = 10)
    blo2 <- blockATT(ps2[S2], tr[S2], y[S2], strata = 10)
    wei1 <- weightATT(ps1[S1], tr[S1], y[S1])
    wei2 <- weightATT(ps2[S2], tr[S2], y[S2])
    cal1 <- caliperATT(ps1[S1], tr[S1], y[S1], X = X1[S1,], caliper = 0.25)
    cal2 <- caliperATT(ps2[S2], tr[S2], y[S2], X = X2[S2,], caliper = 0.25)
    
    return(c(Z,W,pst$coeff[Z+1],out$coeff[Z+1],pst$coeff[W+1],out$coeff[W+1],rho,
             (abs(nea2-att)-abs(nea1-att)),(abs(blo2-att)-abs(blo1-att)),
             (abs(wei2-att)-abs(wei1-att)),0,(abs(cal2-att)-abs(cal1-att)),0))
    
  }	
  else{
    
    cal1 <- caliperATT(ps1[S1], tr[S1], y[S1], X = X1[S1,], caliper = 0.25)
    cal2 <- caliperATT(ps2[S2], tr[S2], y[S2], X = X2[S2,], caliper = 0.25)	
    return(c(Z,W,pst$coeff[Z+1],out$coeff[Z+1],pst$coeff[W+1],out$coeff[W+1],rho,
             (abs(cal2-att)-abs(cal1-att)),0))
  }
}




#The resBb matrix contains the bias of all possible (Z,W) combinations
a=1
resBb<-matrix(rep(0,9*(dim(X)[2])^2),9,(dim(X)[2])^2)
pb <- txtProgressBar(min = 2, max = dim(X)[2]^2 + 1)
for (i in 1:(dim(X)[2])){
  for (j in 1:(dim(X)[2])){
    pa<-patt(i,j,X,tr,y,1)	
    resBb[,a]<-matrix(pa,length(pa),1)
    a=a+1
    setTxtProgressBar(pb, a)
  }
}

#the signs function gives the signs of rho, gammaZ*betaZ, gammaW*betaW, increase in bias after including W
#for a vector that includes true coefficients and ATT as given by function 'patt'

signs<-function(res,cal){
  aux<-sign(c(res[3]*res[4],res[5]*res[6]))
  
  if (cal==1){
    r<-c(sign(res[7]),aux,sign(res[8]))
  }
  else{
    r<-c(sign(res[7]),aux,sign(res[8:12]))
  }
  return(r)
}

#The search_patt function returns the bias and true coefficients of (Z,W) combinations if 
#their rho, gammaZ*betaZ, gammaW*betaW and increase in bias follow a given sign pattern

search_patt<-function(pattern,res,cal){
  sig<-apply(res,2,signs,cal)
  
  for (i in 1:(dim(res)[2])){
    if (sum(abs(pattern-sig[,i]))==0){
      if (cal==1){res[9,i]<-1}
      else {res[13,i]<-1}
    }		
  }
  
  if (cal==1){return(res[,res[9,]==1])}
  else{return(res[,res[13,]==1])}
}


pattern<-c(1,1,-1,1)
A<-search_patt(pattern,resBb,1)

pattern<-c(1,-1,1,1)
B<-search_patt(pattern,resBb,1)

pattern<-c(-1,1,-1,1)
C<-search_patt(pattern,resBb,1)

pattern<-c(-1,-1,1,1)
D<-search_patt(pattern,resBb,1)

#Balancing function modified criteria (for half of the variables the p value needs to have increased)

#The balance function checks balance after matching for specifications with and without W using caliper. 
#it returns the number of contorl variables whose pvalue in t test has increased, indicating and improvement in balance

Balance2<-function(Z,W,X,tr,y,ATE){
  
  #Remember: X1 is with W omitted and X2 is with W included.
  # Variable names are stored in the object varindices, which is generated below.
  
  if(Z %in% imale){
    X2 <- X[,-imale]
    X1 <- X[,-c(imale, W)]
  }
  else if(Z %in% iage){
    X2 <- X[, -iage]
    X1 <- X[, -c(iage,W)]
  }
  else if(Z %in% iyears){
    X2 <- X[, -iyears]
    X1 <- X[, -c(iyears, W)]
  }
  else if(Z %in% imarried){
    X2 <- X[, -imarried]
    X1 <- X[, -c(imarried, W)]
  }
  else if(Z %in% iTT){
    X2 <- X[, -iTT]
    X1 <- X[, -c(iTT, W)]
  }
  else if(Z %in% iheight){
    X2 <- X[, -iheight]
    X1 <- X[, -c(iheight, W)]
  }
  else if(Z %in% ipastincome){
    X2 <- X[, -ipastincome]
    X1 <- X[, -c(ipastincome, W)]
  }
  else {
    X2 <- X[,-Z]
    X1 <- X[, -c(Z,W)]
  }
  
  X1<-data.matrix(X1)
  X2<-data.matrix(X2)
  
  ps1<-glm(tr~X1,family = binomial(link = "logit"))
  ps2<-glm(tr~X2,family = binomial(link = "logit"))
  
  
  ps1f <- ps1$fitted.values	
  S1 <- ((ps1f >= min(ps1f[tr == 1]))&(ps1f <= max( ps1f[tr == 1])))
  
  ps2f <- ps2$fitted.values	
  S2 <- ((ps2f >= min(ps2f[tr == 1]))&(ps2f <= max( ps2f[tr == 1])))
  
  
  att <- 274
  
  ps1<-ps1$linear.pred
  ps2<-ps2$linear.pred	
  
  if (ATE==1){
    EF<-"ATE"
  }
  else{
    EF<-"ATT"
  }
  caliper <- c(0.25, rep(100, ncol(X1[S1,])))
  X <- cbind(ps1[S1], X1[S1,])
  ans_1 <- Match(Y = y[S1], Tr = tr[S1], X = X, estimand = EF, BiasAdjust = FALSE,
                 caliper = caliper, Weight = 2, version = "fast", replace =
                   TRUE, M = 1, distance.tolerance = 1e-8)
  cal1<-ans_1$est
  
  bal1<-MatchBalance(tr[S1]~X1[S1,],data=X, match.out=ans_1, nboots=500,print.level=1, ks = FALSE)	
  
  p1<-matrix(rep(0,ncol(X1)),ncol(X1),1)
  for(i in 1:ncol(X1[S1,])){
    p1[i,1]<-bal1$AfterMatching[[i]]$p.value
  }
  p1<-data.frame(cbind(t(t(colnames(X1))),p1))
  names(p1)<-c("id","p1") 
  
  caliper <- c(0.25, rep(100, ncol(X2[S2,])))
  X <- cbind(ps2[S2], X2[S2,])
  ans_2 <- Match(Y = y[S2], Tr = tr[S2], X = X, estimand = EF, BiasAdjust = FALSE,
                 caliper = caliper, Weight = 2, version = "fast", replace =
                   TRUE, M = 1, distance.tolerance = 1e-8)      
  
  cal2<-ans_2$est	
  
  bal2<-MatchBalance(tr[S2]~X2[S2,],data=X2, match.out=ans_2, nboots=500,print.level=1, ks = FALSE)	
  
  p2<-matrix(rep(0,ncol(X2)),ncol(X2),1)
  for(i in 1:ncol(X2[S2,])){
    p2[i,1]<-bal2$AfterMatching[[i]]$p.value
  }
  p2<-data.frame(cbind(t(t(colnames(X2))),p2))
  names(p2)<-c("id","p2")      
  
  total<- merge(p1,p2,by="id")
  
  total$p1<-as.numeric(as.character(total$p1)) 
  total$p2<-as.numeric(as.character(total$p2)) 
  
  r1<-sum(1*(total$p1<=total$p2))
  
  result<-c(Z,W,(abs(cal2-att)-abs(cal1-att)),r1,ncol(X1),bal1$AMsmallest.p.value,bal2$AMsmallest.p.value)	
  return(result)
}

#Creating identified pattern matrices

patternA<-c(1,1,-1,1)
patternB<-c(1,-1,1,1)
patternC<-c(-1,1,-1,1)
patternD<-c(-1,-1,1,1)

A_ATT<-search_patt(patternA,resBb,1)
B_ATT<-search_patt(patternB,resBb,1)
C_ATT<-search_patt(patternC,resBb,1)
D_ATT<-search_patt(patternD,resBb,1)

#Balancing function

A_ATT2<-matrix(rep(0,7*dim(A_ATT)[2]),7,dim(A_ATT)[2])
#rownames(A_ATT2)<-c("Z","W","|c2-att|-|c1-att|","#var_imp_bal","#vars","small_pval1","small_pval2")
for (i in 1:dim(A_ATT)[2]){
  A_ATT2[,i]<-Balance2(Z=A_ATT[1,i],W=A_ATT[2,i],X,tr,y,ATE=0)
}

B_ATT2<-matrix(rep(0,7*dim(B_ATT)[2]),7,dim(B_ATT)[2])
for (i in 1:dim(B_ATT)[2]){
  B_ATT2[,i]<-Balance2(B_ATT[1,i],B_ATT[2,i],X,tr,y,ATE=0)
}

C_ATT2<-matrix(rep(0,7*dim(C_ATT)[2]),7,dim(C_ATT)[2])
for (i in 1:dim(C_ATT)[2] ){
  C_ATT2[,i]<-Balance2(C_ATT[1,i],C_ATT[2,i],X,tr,y,ATE=0)
}

D_ATT2<-matrix(rep(0,7*dim(D_ATT)[2]),7,dim(D_ATT)[2])
for (i in 1:dim(D_ATT)[2]){
  D_ATT2[,i]<-Balance2(D_ATT[1,i],D_ATT[2,i],X,tr,y,ATE=0)
}


# Add an object that will bind variable names of Z and W, rather than just
# their indices for ease of interpretation for the table. 
#varindices <- names(as.data.frame(X))
#varindices <- as.data.frame(cbind(varindices, c(1:length(varindices))))
#names(varindices) <- c("variable", "index")
#Here are the results for the table

A_ATT2 <- t(A_ATT2)
B_ATT2 <- t(B_ATT2)
C_ATT2 <- t(C_ATT2)
D_ATT2 <- t(D_ATT2)


#Organizing results without duplicates
elim_duplicates<-function(A,c){
  A_aux<-A[order(A[,c]),]
  A_aux<-A_aux[duplicated(A_aux[,c])==FALSE,]
  return(A_aux)
}

A_o<-as.matrix(elim_duplicates(A_ATT2,3))
B_o<-as.matrix(elim_duplicates(B_ATT2,3))
C_o<-t(as.matrix(elim_duplicates(C_ATT2,3)))
D_o<-as.matrix(elim_duplicates(D_ATT2,3))

AC<-rbind(A_o,C_o)
BD<-rbind(B_o,D_o)

AC[,3]<-AC[,3]/274
BD[,3]<-BD[,3]/274


#number of cases where bias is increased and there are countervailing effects
num_pos_neg<-dim(AC)[1]
num_neg_pos<-dim(BD)[1]

AC<-cbind(t(t(rep("+_-",num_pos_neg))),AC)
BD<-cbind(t(t(rep("-_+",num_neg_pos))),BD)

table2<-rbind(AC,BD)
table2<-table2[,1:6]

colnames(table2)<-c("sgn_cu_cx","u","x","change_bias_att","#var_imp_bal","#vars")


#Adding names of variables

table2 <- as.data.frame(as.matrix(table2)) 
varindices <- names(as.data.frame(X))
varindices <- as.data.frame(cbind(varindices, c(1:length(varindices))))

names(varindices) <- c("var_x", "x")
table2 <- merge(varindices,table2,by="x")
names(varindices) <- c("var_u", "u")
table2 <- merge(varindices,table2,by="u")

table2$x<-NULL
table2$u<-NULL

table2<-table2[order(table2$change_bias_att,decreasing= TRUE),]

#eliminating row with very small change in bias
table2<-table2[2:29,]

#Reported table
table2[,c(3,1,2,4,5,6)]


################################################################################
###
###  Begin replication of table 3
###
################################################################################

rm(list = setdiff(ls(), c("table1", "table2")))

library(foreign)
library(Hmisc)
library(boot)
library(xtable)
library(ggm)
library(Matching)
library(gam)
library(car)

set.seed(19)

sessionInfo()



#setwd("~/Google Drive/Overadjustment/McKenzieBalance")
#setwd("~/Desktop/CKR_Brad/sensitivity 2")
load("DWvsCPS.rda")

DWvsCPS <- transform(DWvsCPS,
                     age = age / 100,
                     education = education / 10,
                     u74 = as.numeric(re74 == 0),
                     u75 = as.numeric(re75 == 0))


#Define the value of rho
level<-0.05

#Define the treatment variable

treat <- "treated"
resp <- "re78"

# Create the interaction variables
DWvsCPS$age2 <- DWvsCPS$age^2
DWvsCPS$age3 <- DWvsCPS$age^3
DWvsCPS$education2 <- DWvsCPS$education^2
DWvsCPS$educationre74 <- DWvsCPS$education * DWvsCPS$re74

list <- c("re78",
          "treated",
          "age",
          "age2",
          "age3",
          "education",
          "education2",
          "black",
          "hispanic",
          "married",
          "nodegree",
          "re74",
          "re75",
          "u74",
          "u75",
          "educationre74")

mort <- DWvsCPS

mort.sub <- subset(mort, select=list)
rhc <-mort.sub[complete.cases(mort.sub),]



rhcmod = lm(re78~.,data=rhc)


dat <- rhc

# Rearrange reduced data (dat) so that outcome variable is in the last column,
# and the treatment is in the first column. 
#This procedure should be general to 
# adding or subtracting variables, so long as the names of the treatment and response
# variables remain unchanged. 
col_treat <- grep(treat,names(dat))
col_resp <- grep(resp,names(dat))
dat <- dat[,c(col_treat, (1:ncol(dat))[-c(col_treat,col_resp)] ,col_resp)]


#Change names of treatment and response variables
names(dat)[grep(treat,names(dat))] <- "treat"
names(dat)[grep(resp,names(dat))] <- "resp"

newmod = lm(resp ~ ., data=dat)



get.tboot2 = function(dat,n)
{
  boot.rhc = function(data, indices) {
    data = data[indices,]
    newmod = lm(resp ~ .,
                data = data)
    summary(newmod)$coef[2,1:2]}
  
  bstrap = boot(dat, boot.rhc, n)
  
  tboot = rep(NA,n)
  
  for(i in 1:n)
  {
    tboot[i] = (bstrap$t[i,1]-bstrap$t0[1])/bstrap$t[i,2]
  }
  return(tboot)
}

tb2 = sort(abs(get.tboot2(dat,5000)))
t95 = tb2[ceiling(.95*5000)]


pmod <- glm(treated ~ age +
              age2+
              age3+
              education+
              education2+
              black+
              hispanic+
              married+
              nodegree+
              re74+
              re75+
              u74+
              u75+
              educationre74, family = 'binomial', data = rhc)

pscores = pmod$linear.predictor


pstrat.split = function(pscores, n){
  quant = as.vector(quantile(pscores, probs = c(seq(0,1, by=(1/n)))))
  pstrat = rep(0,length(pscores))
  
  for(j in 1:n){
    for(i in 1:length(pscores)){
      if((pscores[i]>=quant[j])&&(pscores[i]<quant[j+1])){
        pstrat[i]=j}
      if(pscores[i]==quant[n+1]){
        pstrat[i]=n}}}
  
  return(pstrat)}

psform = as.formula("treated ~ age +
                    age2+
                    age3+
                    education+
                    education2+
                    black+
                    hispanic+
                    married+
                    nodegree+
                    re74+
                    re75+
                    u74+
                    u75+
                    educationre74")
##Process of omitting covs from pscore

list=c(all.vars(psform)[2:length(all.vars(psform))])
dat = rhc$treated

for(i in 1:length(list)){
  a = rhc[which(names(rhc)==list[i])]
  dat = cbind(dat, a)
}
dat$treated = rhc$treated
dat = dat[,-1]


specpars.ps = function(rhc, dat, var.index, n, type){
  newdat = dat[,-var.index]
  
  pmod2 = glm(treated ~ ., family = "binomial", data=newdat)
  pscores = pmod2$linear.predictor
  pstrat = as.factor(pstrat.split(pscores, n))
  
  dat$re78 = rhc$re78
  form1 <- as.formula(paste("re78", "~", 
                            paste(c("treated","pstrat",colnames(dat)[var.index]), collapse = "+"),
                            sep = ""
  ))
  trtmod <- lm(form1, data = dat)
  r.with = summary(trtmod)$r.squared
  b.add = summary(trtmod)$coef[2,1]
  se.b.add = summary(trtmod)$coef[2,2]
  
  trtmod2 = lm(re78~treated+pstrat, data=dat)
  r.without = summary(trtmod2)$r.squared
  b = summary(trtmod2)$coef[2,1]
  se.b = summary(trtmod2)$coef[2,2]
  
  r.par = (r.with - r.without)/(1-r.without)
  
  if(type==1){
    form2 <- as.formula(paste("re78", "~", 
                              paste(c(colnames(dat)[var.index],"pstrat"), collapse = "+"),
                              sep = ""
    ))
    bigmod = lm(form2, data = dat)
    t.w = summary(bigmod)$coef[2,3]
    df = dim(newdat)[1]-n-3
  }
  
  if(type==2){
    smmod = lm(re78~pstrat, data=dat)
    form3 <- as.formula(paste("re78", "~", 
                              paste(c(colnames(dat)[var.index],"pstrat"), collapse = "+"),
                              sep = ""
    ))
    bigmod = lm(form3,data = dat)
    Fw=((deviance(smmod)-deviance(bigmod))/(deviance(bigmod)/df.residual(bigmod))) 
    df = summary(bigmod)$df[2] 
    k= length(levels(dat[,j]))-1 
    t.w = sqrt((k*df/(df+1-k))*Fw)
  }
  
  specpars = cbind(r.par, t.w, b, se.b, df, b.add, se.b.add)
  return(specpars)
}

# Because there are interaction terms, need to be careful with removing groups 
# of variables rather than one at a time

spec.ps <- matrix(0,8,7)

# First, omit age variables
type = as.numeric((length(levels(dat[,1]))>2))+1
spec.ps[1,] <- specpars.ps(rhc, dat, c(1,2,3), 6, type)

# omit education variables
type = as.numeric((length(levels(dat[,4]))>2))+1
spec.ps[2,] <- specpars.ps(rhc, dat, c(4, 5, 14), 6, type)

# omit race indicator Black
type = as.numeric((length(levels(dat[,6]))>2))+1
spec.ps[3,] <- specpars.ps(rhc, dat, 6, 6, type)

# omit race indicator hispanic
type = as.numeric((length(levels(dat[,7]))>2))+1
spec.ps[4,] <- specpars.ps(rhc, dat, 7, 6, type)

# omit married indicator
type = as.numeric((length(levels(dat[,8]))>2))+1
spec.ps[5,] <- specpars.ps(rhc, dat, 8, 6, type)

# omit no degree indicator
type = as.numeric((length(levels(dat[,9]))>2))+1
spec.ps[6,] <- specpars.ps(rhc, dat, 9, 6, type)

#omit income in 74
type = as.numeric((length(levels(dat[,10]))>2))+1
spec.ps[7,] <- specpars.ps(rhc, dat, c(10, 12, 14), 6, type)

#omit income in 75
type = as.numeric((length(levels(dat[,11]))>2))+1
spec.ps[8,] <- specpars.ps(rhc, dat, c(11,13), 6, type)


spec.ps = as.data.frame(round(spec.ps, 4))

names(spec.ps) = c("r.par", "t.w", "b", "se.b", "df", "add.b", "add.se.b")

list <- c("age",
          "education",
          "black",
          "hispanic",
          "married",
          "no degree",
          "income 74",
          "income 75")

row.names(spec.ps) = c("age",
                       "education",
                       "black",
                       "hispanic",
                       "married",
                       "no degree",
                       "income 74",
                       "income 75")


make.ci = function(dat, specpars, index, tquant, r.par)
{
  T = abs(as.numeric(specpars[index,2]))
  k = length(levels(dat[,index]))-1
  df = specpars[index,5]
  
  g = (T^2*(df+k))/(T^2*(df+k) + tquant^2*(T^2+df))
  
  if(r.par<=g)
  {ci = cor2(dat, specpars, index, tquant, r.par)}
  if(r.par>g)
  {ci = cor1(dat, specpars, index, tquant)}
  return(ci)
}


cor2 = function(dat, specpars, index, tquant, r.par)
{
  T = abs(as.numeric(specpars[index,2]))
  df = specpars[index,5]
  b = specpars[index,3]
  se.b = specpars[index,4]
  
  adj1 = T*sqrt(r.par)
  adj2 = tquant*sqrt(((T^2)+df)/(df-1))*sqrt(1-r.par)
  adj = adj1+adj2
  
  lb = b - adj*se.b
  ub = b + adj*se.b
  
  ci = cbind(lb,ub)
  return(ci)
} 



col2 = matrix(0,length(list),2)
for(i in 1:length(list)){
  col2[i,] = cor2(dat, spec.ps, i, t95, .01)
}
row.names(col2) = list



table3 <- col2
table3 <- as.data.frame(table3)
names(table3) <- c("Lower Bound", "Upper Bound")
View(table3)

rm(list = setdiff(ls(), c("table1", "table2", "table3")))


